import factor_analyzer
import pandas as pd
from sklearn.datasets import load_iris
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline
import plotly.express as px
df = pd.read_csv("tornado.csv")
df.head()
| yr | mo | dy | date | st | mag | inj | fat | slat | slon | elat | elon | len | wid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1950 | 1 | 3 | 1/3/1950 | IL | 3 | 3 | 0 | 39.10 | -89.30 | 39.12 | -89.23 | 3.6 | 130 |
| 1 | 1950 | 1 | 3 | 1/3/1950 | MO | 3 | 3 | 0 | 38.77 | -90.22 | 38.83 | -90.03 | 9.5 | 150 |
| 2 | 1950 | 1 | 3 | 1/3/1950 | OH | 1 | 1 | 0 | 40.88 | -84.58 | 0.00 | 0.00 | 0.1 | 10 |
| 3 | 1950 | 1 | 13 | 1/13/1950 | AR | 3 | 1 | 1 | 34.40 | -94.37 | 0.00 | 0.00 | 0.6 | 17 |
| 4 | 1950 | 1 | 25 | 1/25/1950 | IL | 2 | 0 | 0 | 41.17 | -87.33 | 0.00 | 0.00 | 0.1 | 100 |
df = df.rename(columns={"yr": "Year", "mo": "Month",'dy':'Day','date':'Date','st':'State','mag':'Magnitude',
'inj':'Injuries','fat':'Fatalities','slat':'StartingLatitude','slon':'StartingLatitude',
'elat':'EndingLatitude','elon':'EndingLongitude','len':'Length','wid':'Width'})
df.head()
| Year | Month | Day | Date | State | Magnitude | Injuries | Fatalities | StartingLatitude | StartingLatitude | EndingLatitude | EndingLongitude | Length | Width | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1950 | 1 | 3 | 1/3/1950 | IL | 3 | 3 | 0 | 39.10 | -89.30 | 39.12 | -89.23 | 3.6 | 130 |
| 1 | 1950 | 1 | 3 | 1/3/1950 | MO | 3 | 3 | 0 | 38.77 | -90.22 | 38.83 | -90.03 | 9.5 | 150 |
| 2 | 1950 | 1 | 3 | 1/3/1950 | OH | 1 | 1 | 0 | 40.88 | -84.58 | 0.00 | 0.00 | 0.1 | 10 |
| 3 | 1950 | 1 | 13 | 1/13/1950 | AR | 3 | 1 | 1 | 34.40 | -94.37 | 0.00 | 0.00 | 0.6 | 17 |
| 4 | 1950 | 1 | 25 | 1/25/1950 | IL | 2 | 0 | 0 | 41.17 | -87.33 | 0.00 | 0.00 | 0.1 | 100 |
condition = df['Year'] <= 2006 # Define the condition
df_older = df[condition] # Select rows that meet the condition
df = df[~condition] # Select rows that do not meet the condition
#df_older.head()
df.head()
#df.tail()
| Year | Month | Day | Date | State | Magnitude | Injuries | Fatalities | StartingLatitude | StartingLatitude | EndingLatitude | EndingLongitude | Length | Width | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 49296 | 2007 | 1 | 4 | 1/4/2007 | LA | 1 | 0 | 0 | 30.60 | -91.45 | 30.62 | -91.47 | 1.83 | 75 |
| 49297 | 2007 | 1 | 4 | 1/4/2007 | LA | 1 | 15 | 2 | 29.92 | -91.80 | 30.05 | -91.73 | 15.07 | 100 |
| 49298 | 2007 | 1 | 5 | 1/5/2007 | GA | 0 | 0 | 0 | 33.27 | -84.56 | 33.29 | -84.55 | 1.68 | 200 |
| 49299 | 2007 | 1 | 5 | 1/5/2007 | GA | 0 | 0 | 0 | 31.32 | -82.47 | 31.34 | -82.47 | 2.00 | 100 |
| 49300 | 2007 | 1 | 5 | 1/5/2007 | GA | 1 | 0 | 0 | 33.36 | -84.90 | 33.42 | -84.84 | 5.39 | 200 |
# dimensions
size=df.size
shape=df.shape
dimensions=df.ndim
info=df.info()
print(f"shape: {shape}, size: {size}, dimensions:{dimensions}, info summary: {info}")
<class 'pandas.core.frame.DataFrame'> Int64Index: 18262 entries, 49296 to 67557 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 18262 non-null int64 1 Month 18262 non-null int64 2 Day 18262 non-null int64 3 Date 18262 non-null object 4 State 18262 non-null object 5 Magnitude 18262 non-null int64 6 Injuries 18262 non-null int64 7 Fatalities 18262 non-null int64 8 StartingLatitude 18262 non-null float64 9 StartingLatitude 18262 non-null float64 10 EndingLatitude 18262 non-null float64 11 EndingLongitude 18262 non-null float64 12 Length 18262 non-null float64 13 Width 18262 non-null int64 dtypes: float64(5), int64(7), object(2) memory usage: 2.1+ MB shape: (18262, 14), size: 255668, dimensions:2, info summary: None
#datatypes
types=df.dtypes
types
Year int64 Month int64 Day int64 Date object State object Magnitude int64 Injuries int64 Fatalities int64 StartingLatitude float64 StartingLatitude float64 EndingLatitude float64 EndingLongitude float64 Length float64 Width int64 dtype: object
#checking for nulls
df.isna().sum()
Year 0 Month 0 Day 0 Date 0 State 0 Magnitude 0 Injuries 0 Fatalities 0 StartingLatitude 0 StartingLatitude 0 EndingLatitude 0 EndingLongitude 0 Length 0 Width 0 dtype: int64
df.describe()
| Year | Month | Day | Magnitude | Injuries | Fatalities | StartingLatitude | StartingLatitude | EndingLatitude | EndingLongitude | Length | Width | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 18262.00000 | 18262.000000 | 18262.000000 | 18262.000000 | 18262.000000 | 18262.000000 | 18262.000000 | 18262.000000 | 18262.000000 | 18262.000000 | 18262.000000 | 18262.000000 |
| mean | 2013.92520 | 5.815135 | 16.500876 | 0.285566 | 0.833808 | 0.072226 | 37.161643 | -92.051996 | 36.610713 | -90.557844 | 3.545453 | 161.174242 |
| std | 4.44525 | 2.625002 | 8.749836 | 1.883784 | 16.496789 | 1.591000 | 4.811412 | 8.329815 | 6.601923 | 13.999551 | 6.528550 | 261.827175 |
| min | 2007.00000 | 1.000000 | 1.000000 | -9.000000 | 0.000000 | 0.000000 | 17.721200 | -159.658000 | 0.000000 | -159.647000 | 0.010000 | 1.000000 |
| 25% | 2010.00000 | 4.000000 | 9.000000 | 0.000000 | 0.000000 | 0.000000 | 33.416025 | -97.688750 | 33.268000 | -97.552550 | 0.380000 | 40.000000 |
| 50% | 2014.00000 | 5.000000 | 17.000000 | 0.000000 | 0.000000 | 0.000000 | 36.897850 | -92.320450 | 36.812950 | -92.041250 | 1.460000 | 75.000000 |
| 75% | 2018.00000 | 7.000000 | 24.000000 | 1.000000 | 0.000000 | 0.000000 | 40.667925 | -86.430950 | 40.620000 | -86.072300 | 4.110000 | 160.000000 |
| max | 2021.00000 | 12.000000 | 31.000000 | 5.000000 | 1500.000000 | 158.000000 | 49.330000 | -64.715100 | 49.330000 | 0.000000 | 168.530000 | 4576.000000 |
## measures of central tendency for Tornado Length
# mean
mean=round(df['Length'].mean(),3)
# median
median=df['Length'].median()
# mode
mode=df['Length'].mode()
#standard deviation
std=df['Length'].std()
#variance
variance=df['Length'].var()
#range
range=df['Length'].max()-df['Length'].min()
print(f'mean: {mean}, median: {median}, mode: {mode}, standard deviation: {std},variance: {variance}, range:{range}')
mean: 3.545, median: 1.46, mode: 0 0.1 Name: Length, dtype: float64, standard deviation: 6.528549858490544,variance: 42.62196325479691, range:168.52
df.hist(column="Year")
#more tornadoes recorded in recent years (or more tornadoes...)
array([[<AxesSubplot:title={'center':'Year'}>]], dtype=object)
df.hist(column="Month")
#may has most tornadoes, pretty normal distribution except for heavier december
array([[<AxesSubplot:title={'center':'Month'}>]], dtype=object)
df.hist(column="Day")
#uniform ish but more tonadoes recorded towards end of month
array([[<AxesSubplot:title={'center':'Day'}>]], dtype=object)
df['Magnitude'].value_counts().plot(kind='barh',title="Tornadoe Magnitude Counts, 1957-2021")
<AxesSubplot:title={'center':'Tornadoe Magnitude Counts, 1957-2021'}>
#injuries df
injuries=df['Injuries'].value_counts()
injuries_df = pd.DataFrame(data=injuries)
#fatalities df
fatalities=df['Fatalities'].value_counts()
fatalities_df = pd.DataFrame(data=fatalities)
#no injuries was by far most common then pretty much in order (1 injury was 2nd common, then 2, then 3, etc ish)
#merging injuries_df & fatalities_df together
injury_fatal=injuries_df.join(fatalities_df)
injury_fatal['Fatalities'] = injury_fatal['Fatalities'].fillna(0)
injury_fatal=injury_fatal.reset_index()
injury_fatal=injury_fatal.rename(columns={"index": "Count"})
injury_fatal.head(15)
| Count | Injuries | Fatalities | |
|---|---|---|---|
| 0 | 0 | 17059 | 17962.0 |
| 1 | 1 | 413 | 144.0 |
| 2 | 2 | 220 | 60.0 |
| 3 | 3 | 103 | 25.0 |
| 4 | 4 | 68 | 15.0 |
| 5 | 5 | 59 | 4.0 |
| 6 | 6 | 31 | 9.0 |
| 7 | 7 | 30 | 6.0 |
| 8 | 8 | 25 | 6.0 |
| 9 | 10 | 23 | 4.0 |
| 10 | 9 | 21 | 4.0 |
| 11 | 12 | 15 | 1.0 |
| 12 | 20 | 12 | 1.0 |
| 13 | 11 | 11 | 3.0 |
| 14 | 15 | 11 | 0.0 |
df[['Length','Injuries','Fatalities']].describe()
| Length | Injuries | Fatalities | |
|---|---|---|---|
| count | 18262.000000 | 18262.000000 | 18262.000000 |
| mean | 3.545453 | 0.833808 | 0.072226 |
| std | 6.528550 | 16.496789 | 1.591000 |
| min | 0.010000 | 0.000000 | 0.000000 |
| 25% | 0.380000 | 0.000000 | 0.000000 |
| 50% | 1.460000 | 0.000000 | 0.000000 |
| 75% | 4.110000 | 0.000000 | 0.000000 |
| max | 168.530000 | 1500.000000 | 158.000000 |
#injury_fatal.to_excel("injury_fatal.xlsx")
#injury_fatal['Fatalities'].plot(x="Count", y=["Injuries", "Fatalities"], kind="bar")
#count of tornadoes by state
print(df['State'].value_counts())
TX 1807 KS 1245 OK 1082 MS 925 AL 923 IA 779 IL 772 MO 759 LA 746 MN 652 CO 625 NE 618 FL 607 GA 606 AR 526 TN 511 KY 460 NC 457 ND 421 IN 414 WI 372 OH 367 SD 351 SC 344 VA 259 PA 258 MI 192 WY 154 NM 151 NY 129 MD 125 CA 106 MT 75 AZ 61 CT 43 ME 37 NJ 36 ID 35 WA 33 MA 33 WV 33 OR 29 UT 23 NV 20 NH 14 DE 14 VT 10 PR 9 HI 6 RI 5 DC 2 VI 1 Name: State, dtype: int64
#top 20 counts by state
df['State'].value_counts()[:20].plot(kind='barh',title="Top 20 States by Tornado Count, 1957-2021")
<AxesSubplot:title={'center':'Top 20 States by Tornado Count, 1957-2021'}>
df.hist(column="Length")
array([[<AxesSubplot:title={'center':'Length'}>]], dtype=object)
df.hist(column="Width")
array([[<AxesSubplot:title={'center':'Width'}>]], dtype=object)
#animation of width and length of tornadoes
fig=px.scatter(df, x="Width", y="Length",
animation_frame="Year",
# animation_group="State", #add if we wanted to do by state averages
color="Magnitude", # this is like "hue" in seaborn, to group by a categorical feature
hover_name="State", # adds labels from a categorical feature when hover over a data point
range_x=[0,3000], # this sets the min and max values to show in the x-axis
range_y=[0,250], # this sets the min and max values to show in the y-axis
title="Tornado Width vs. Length from 1950-2021, by Magnitude", # add a title
)
fig
##deaths by year bar chart
deaths = df.groupby('Year')['Fatalities'].sum().reset_index()
fig = px.bar(deaths , x = 'Year', y = 'Fatalities', color = 'Fatalities', labels = {'Fatalities':'Fatalities','Year':'Year'})
fig.show()
states_fat = df.groupby(['State'])['Fatalities'].sum().reset_index()
states_fat.head()
| State | Fatalities | |
|---|---|---|
| 0 | AL | 301 |
| 1 | AR | 68 |
| 2 | AZ | 0 |
| 3 | CA | 0 |
| 4 | CO | 3 |
states_inj = df.groupby(['State'])['Injuries'].sum().reset_index()
#states_inj.head()
#fatalities by state heatmap
fig = px.choropleth(states_fat,
locations = "State",
locationmode="USA-states",
scope="usa",
color = 'Fatalities' ,
labels={'Year':'Year','State':'State','Fatalities':'Deaths'},
color_continuous_scale= 'YlOrBr')
fig.update_layout(title = "Tornado Fatalities in States 2007-2021")
fig.show()
#injuries by state heatmap
fig2 = px.choropleth(states_inj,
locations = "State",
locationmode="USA-states",
scope="usa",
color = 'Injuries' ,
labels={'Year':'Year','State':'State','Injuries':'Injuries'},
color_continuous_scale= 'YlOrBr')
fig2.update_layout(title = "Tornado Injuries in States 2007-2021")
fig2.show()